Jenna Bittner
## IMPORT DATA
# Imports
import pandas as pd
# Load a sample of reviews to handle memory efficiently: the dump is
# JSON-lines, so nrows reads only the first 100k records.
df_reviews = pd.read_json(
'/fs/ess/PAS2038/PHYSICS_5680_OSU/project_data/goodreads/goodreads/complete/goodreads_reviews_dedup.json.gz',
nrows=100000,
lines=True
)
# Collect the book_ids present in the review sample so the genre file can
# be filtered down to only the books we actually have reviews for.
book_ids = set(df_reviews['book_id'].unique())
# Filtered pieces of the genre file accumulate here and are concatenated once.
filtered_genre_chunks = []
# Stream the genre file in 10k-row chunks (too large to load whole).
for chunk in pd.read_json(
'/fs/ess/PAS2038/PHYSICS_5680_OSU/project_data/goodreads/goodreads/complete/goodreads_book_genres_initial.json.gz',
lines=True,
chunksize=10000
):
    # Keep rows whose book_id is in the review sample and whose genres value
    # is non-null and non-empty (.str.len() applies len() element-wise, so it
    # also works on the dict values this column holds).
    filtered_chunk = chunk[chunk['book_id'].isin(book_ids) & chunk['genres'].notna() & (chunk['genres'].str.len() > 0)]
    # Drop books tagged ONLY as 'fiction': rows containing 'fiction' are kept
    # only when at least one other genre is present.
    filtered_chunk = filtered_chunk[filtered_chunk['genres'].apply(lambda x: len(x) > 1 if 'fiction' in x else True)]
    filtered_genre_chunks.append(filtered_chunk)
# Combine all filtered chunks into one dataframe
df_genres = pd.concat(filtered_genre_chunks, ignore_index=True)
# Inner-join so every surviving row has both a review and genre data.
df = pd.merge(df_reviews, df_genres, on='book_id', how='inner')
# Sanity checks: shape, missing values, and null genres after the merge.
print('Shape of merged dataframe:', df.shape)
print()
print('Missing data summary:\n', df.isnull().sum())
print()
print('Number of rows with null genres:', df_genres['genres'].isnull().sum())
Shape of merged dataframe: (98050, 12) Missing data summary: user_id 0 book_id 0 review_id 0 rating 0 review_text 0 date_added 0 date_updated 0 read_at 0 started_at 0 n_votes 0 n_comments 0 genres 0 dtype: int64 Number of rows with null genres: 0
# Inspect the column names of the merged dataframe.
df.columns
Index(['user_id', 'book_id', 'review_id', 'rating', 'review_text',
'date_added', 'date_updated', 'read_at', 'started_at', 'n_votes',
'n_comments', 'genres'],
dtype='object')
# Preview the first few rows of the merged dataframe.
df.head()
| user_id | book_id | review_id | rating | review_text | date_added | date_updated | read_at | started_at | n_votes | n_comments | genres | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 8842281e1d1347389f2ab93d60773d4d | 24375664 | 5cd416f3efc3f944fce4ce2db2290d5e | 5 | Mind blowingly cool. Best science fiction I've... | Fri Aug 25 13:55:02 -0700 2017 | Mon Oct 09 08:55:59 -0700 2017 | Sat Oct 07 00:00:00 -0700 2017 | Sat Aug 26 00:00:00 -0700 2017 | 16 | 0 | {'fiction': 1059, 'fantasy, paranormal': 114} |
| 1 | df889690c61e3d0ced39614a3b4a07c1 | 24375664 | 46301875c07d309c0f53cfcbfd247196 | 4 | It took me 2 months to read the first 200 page... | Tue May 24 13:50:59 -0700 2016 | Mon Oct 03 12:03:43 -0700 2016 | Mon Oct 03 12:03:43 -0700 2016 | Mon Aug 08 21:05:00 -0700 2016 | 0 | 0 | {'fiction': 1059, 'fantasy, paranormal': 114} |
| 2 | dd39f0a2f72aeb2d44c59f3e9f1ddda2 | 24375664 | 1c6d353e3054c9485b037f06e8a00570 | 3 | 3.5/5.0 | Sat Jul 02 03:09:16 -0700 2016 | Sat Jul 02 03:10:16 -0700 2016 | Sat Jul 02 03:10:16 -0700 2016 | 0 | 0 | {'fiction': 1059, 'fantasy, paranormal': 114} | |
| 3 | 8842281e1d1347389f2ab93d60773d4d | 18245960 | dfdbb7b0eb5a7e4c26d59a937e2e5feb | 5 | This is a special book. It started slow for ab... | Sun Jul 30 07:44:10 -0700 2017 | Wed Aug 30 00:00:26 -0700 2017 | Sat Aug 26 12:05:52 -0700 2017 | Tue Aug 15 13:23:18 -0700 2017 | 28 | 1 | {'fiction': 393, 'fantasy, paranormal': 341, '... |
| 4 | df889690c61e3d0ced39614a3b4a07c1 | 18245960 | 198f663bbbb2926f8ad723796eef23ed | 0 | If you like Contact, you'll love this \n This ... | Tue May 24 13:54:43 -0700 2016 | Tue May 24 13:54:43 -0700 2016 | 2 | 0 | {'fiction': 393, 'fantasy, paranormal': 341, '... |
## Distribution of genres
# Imports
import plotly.express as px
# Define a function to get the first genre
def get_first_genre(genre_dict):
    """Return the first genre key of a non-empty dict, else None.

    The genres column holds dicts mapping genre name -> vote count;
    dict insertion order decides which genre counts as "primary".
    """
    if not isinstance(genre_dict, dict) or not genre_dict:
        return None  # not a dict, or an empty one
    return next(iter(genre_dict))
# Derive the primary genre (first key of the genres dict) for every review.
df['primary_genre'] = df['genres'].apply(get_first_genre)
# Frequency table of primary genres.
genre_counts = df['primary_genre'].value_counts().reset_index()
genre_counts.columns = ['primary_genre', 'count']
# Bar chart of the ten most common primary genres.
fig = px.bar(
genre_counts.head(10),
x='primary_genre',
y='count',
title='Top 10 Genres in Dataset',
labels={'primary_genre': 'Genre', 'count': 'Count'},
color='count',
color_continuous_scale='Viridis'
)
# Rotate x labels and enlarge the figure so long genre names stay readable.
fig.update_layout(
xaxis_title='Genre',
yaxis_title='Count',
xaxis_tickangle=-45,
width=900, # Increase figure width
height=500, # Increase figure height
margin=dict(l=60, r=60, t=60, b=100) # Adjust margins for more spacing
)
fig.show()
## Distribution of secondary genres
# Define a function to get the second genre if it exists
def get_second_genre(genre_dict):
    """Return the second genre key if the dict has at least two entries, else None."""
    if isinstance(genre_dict, dict) and len(genre_dict) > 1:
        keys = iter(genre_dict)
        next(keys)        # skip the primary genre
        return next(keys) # second key in insertion order
    return None
# Derive the secondary genre (second key of the genres dict, if any).
df['secondary_genre'] = df['genres'].apply(get_second_genre)
# Frequency table of secondary genres.
secondary_genre_counts = df['secondary_genre'].value_counts().reset_index()
secondary_genre_counts.columns = ['secondary_genre', 'count']
# Bar chart of the ten most common secondary genres.
fig = px.bar(
secondary_genre_counts.head(10),
x='secondary_genre',
y='count',
title='Top 10 Subgenres in Dataset',
labels={'secondary_genre': 'Secondary Genre', 'count': 'Count'},
color='count',
color_continuous_scale='Viridis'
)
# Rotate x labels and size the figure so genre names stay readable.
fig.update_layout(
xaxis_title='Secondary Genre',
yaxis_title='Count',
xaxis_tickangle=-45,
width=900,
height=500,
margin=dict(l=60, r=60, t=60, b=100)
)
fig.show()
## Distribution of third genres, if they exist
# Function to get the third genre, if it exists
def get_third_genre(genre_dict):
    """Return the third genre key if the dict has at least three entries, else None."""
    if not isinstance(genre_dict, dict):
        return None
    keys = list(genre_dict)
    # Index 2 is the third key in insertion order.
    return keys[2] if len(keys) >= 3 else None
# Derive the third genre (third key of the genres dict, if present).
df['third_genre'] = df['genres'].apply(get_third_genre)
# Frequency table of third genres.
third_genre_counts = df['third_genre'].value_counts().reset_index()
third_genre_counts.columns = ['third_genre', 'count']
# Bar chart of the ten most common third genres.
fig = px.bar(third_genre_counts.head(10),
x='third_genre',
y='count',
title='Top 10 Third Genres in Dataset',
labels={'third_genre': 'Third Genre', 'count': 'Count'},
color='count',
color_continuous_scale='Viridis')
# Rotate x labels and size the figure for readability.
fig.update_layout(
xaxis_title='Third Genre',
yaxis_title='Count',
xaxis_tickangle=-45,
width=900,
height=500,
margin=dict(l=60, r=60, t=60, b=100)
)
fig.show()
## Average rating for each genre
# Mean review rating per primary genre, highest first.
avg_rating_by_genre = (
    df.groupby('primary_genre')['rating']
    .mean()
    .reset_index()
    .sort_values(by='rating', ascending=False)
)
# Bar chart of the ten genres with the highest average rating.
fig = px.bar(
    avg_rating_by_genre.head(10),
    x='primary_genre',
    y='rating',
    title='Top 10 Genres by Average Rating',
    labels={'primary_genre': 'Genre', 'rating': 'Average Rating'},
    color='rating',
    color_continuous_scale='purp',
)
# Tilt the genre labels and size the figure for readability.
fig.update_layout(
    xaxis_title='Genre',
    yaxis_title='Average Rating',
    xaxis_tickangle=-45,
    width=800,
    height=500,
    margin=dict(l=40, r=40, t=60, b=100),
)
fig.show()
## Distribution of ratings
# Histogram of the review ratings.
fig = px.histogram(df,
x='rating',
nbins=10,
title='Distribution of Ratings',
labels={'rating': 'Rating'},
color_discrete_sequence=['skyblue'])
# Slightly transparent bars.
fig.update_traces(opacity=0.85)
# Center the title and add light gridlines.
fig.update_layout(
xaxis_title='Rating',
yaxis_title='Count',
width=800,
height=500,
title_x=0.5,
xaxis=dict(gridcolor='lightgray'),
yaxis=dict(gridcolor='lightgray'),
margin=dict(l=40, r=40, t=60, b=100)
)
fig.show()
## Average length of review by genre
# Word count of each raw review; str() guards against non-string values.
df['review_word_count'] = df['review_text'].apply(lambda x: len(str(x).split()))
# Mean word count per primary genre, longest reviews first.
avg_word_count_by_genre = df.groupby('primary_genre')['review_word_count'].mean().reset_index().sort_values(by='review_word_count', ascending=False)
fig = px.bar(avg_word_count_by_genre.head(10),
x='primary_genre',
y='review_word_count',
title='Average Review Word Count by Genre',
labels={'primary_genre': 'Genre', 'review_word_count': 'Average Word Count'},
color='review_word_count',
color_continuous_scale='Sunsetdark')
# Transparency plus rotated labels for readability.
fig.update_traces(opacity=0.85)
fig.update_layout(
xaxis_title='Genre',
yaxis_title='Average Word Count',
title_x=0.5,
width=800,
height=500,
xaxis=dict(tickangle=-45, title_standoff=10),
margin=dict(l=40, r=40, t=60, b=100)
)
fig.show()
# ## Average rating over time
# # Imports
# import time
# # Convert 'date_added' to datetime and handle errors by setting invalid values to NaT
# df['date_added'] = pd.to_datetime(df['date_added'], errors='coerce')
# # Check for any invalid dates (NaT)
# invalid_dates = df[df['date_added'].isna()]
# # Drop rows where 'date_added' is NaT (invalid dates)
# df = df.dropna(subset=['date_added'])
# # Extract the year from 'date_added'
# df['year_added'] = df['date_added'].dt.year
# # Calculate average rating by year
# avg_rating_by_year = df.groupby('year_added')['rating'].mean().reset_index()
# # Create line plot for average rating over time
# fig = px.line(avg_rating_by_year,
# x='year_added',
# y='rating',
# title='Average Rating Over Time',
# labels={'year_added': 'Year', 'rating': 'Average Rating'},
# markers=True)
# fig.update_layout(xaxis_title='Year', yaxis_title='Average Rating')
# fig.show()
## REPLACE THE FIRST GENRE 'FICTION' WITH ITS SECONDARY GENRE
# Imports
from collections import Counter
# Safety check before the replacement: count 'fiction' rows whose secondary
# genre is missing or empty (expected to be 0, because the load-time filter
# dropped books tagged only as 'fiction').
num_fiction_with_no_secondary_genre = df[(df['primary_genre'] == 'fiction') &
(df['secondary_genre'].isna() | (df['secondary_genre'] == ''))].shape[0]
print(f"Number of 'fiction' books with no secondary genre: {num_fiction_with_no_secondary_genre}")
print()
# For 'fiction' rows, overwrite primary_genre with the same row's
# secondary_genre (pandas aligns the right-hand Series by index).
df.loc[df['primary_genre'] == 'fiction', 'primary_genre'] = df['secondary_genre']
# Show the resulting genre distribution.
genre_counts = Counter(df['primary_genre'])
for genre, count in genre_counts.items():
    print(f"{genre}: {count}")
print()
# Verify no primary genre ended up missing after the replacement.
nan_rows = df[df['primary_genre'].isnull()]
print(f"Number of rows with missing primary genre: {nan_rows.shape[0]}")
Number of 'fiction' books with no secondary genre: 0 fantasy, paranormal: 18764 mystery, thriller, crime: 11672 history, historical fiction, biography: 12796 non-fiction: 11715 children: 4128 young-adult: 11254 romance: 20321 comics, graphic: 6369 poetry: 1031 Number of rows with missing primary genre: 0
## CLEAN REVIEW DATA
# Imports
import nltk
import re
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
# Fetch the NLTK resources used below (no-ops when already downloaded).
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('punkt')
# English stop words stripped from every review.
stop_words = set(stopwords.words('english'))
# WordNet lemmatizer used to normalize tokens.
lemmatizer = WordNetLemmatizer()
# Preprocessing function
def preprocess_review(review_text):
    """Tokenize, clean, and lemmatize one raw review string.

    Pipeline (order matters): lowercase + tokenize -> drop stop words ->
    lemmatize -> strip non-word characters -> drop pure numbers ->
    drop tokens of length <= 2. Returns the surviving tokens as a list.
    """
    cleaned = []
    for raw_token in word_tokenize(review_text.lower()):
        # Stop words are removed before lemmatization, as in the original
        # notebook, so inflected stop words are NOT caught here.
        if raw_token in stop_words:
            continue
        lemma = lemmatizer.lemmatize(raw_token)
        # Strip anything that is not a word character; discard the token
        # if nothing remains, if it is purely numeric, or if it is short.
        stripped = re.sub(r'\W+', '', lemma)
        if not stripped or stripped.isdigit() or len(stripped) <= 2:
            continue
        cleaned.append(stripped)
    return cleaned
# Tokenize and clean every review (slow: runs the NLTK pipeline per row).
df['final_reviews'] = df['review_text'].apply(preprocess_review)
# Drop duplicate reviews based on the cleaned token list.
# NOTE(review): final_reviews holds Python lists, which are unhashable;
# drop_duplicates on such a column raises TypeError under some pandas
# versions — deduplicating on a joined-string version would be safer. Confirm
# against the pandas version in use.
df = df.drop_duplicates(subset=['final_reviews'])
# View a result (label-based lookup: row with index 0 must survive the dedup).
print(df.final_reviews[0], '\n')
[nltk_data] Downloading package stopwords to [nltk_data] /users/PAS2038/bittner87/nltk_data... [nltk_data] Package stopwords is already up-to-date! [nltk_data] Downloading package wordnet to [nltk_data] /users/PAS2038/bittner87/nltk_data... [nltk_data] Package wordnet is already up-to-date! [nltk_data] Downloading package omw-1.4 to [nltk_data] /users/PAS2038/bittner87/nltk_data... [nltk_data] Package omw-1.4 is already up-to-date! [nltk_data] Downloading package punkt to [nltk_data] /users/PAS2038/bittner87/nltk_data... [nltk_data] Package punkt is already up-to-date!
['mind', 'blowingly', 'cool', 'best', 'science', 'fiction', 'read', 'time', 'loved', 'description', 'society', 'future', 'lived', 'tree', 'notion', 'owning', 'property', 'even', 'getting', 'married', 'gone', 'every', 'surface', 'screen', 'undulation', 'society', 'responds', 'trisolaran', 'threat', 'seem', 'surprising', 'maybe', 'chinese', 'perspective', 'would', 'thought', 'eto', 'would', 'exist', 'book', 'would', 'thought', 'people', 'would', 'get', 'overconfident', 'primitive', 'fleet', 'chance', 'given', 'think', 'superior', 'science', 'would', 'weapon', 'defense', 'would', 'rifle', 'arrow', 'moment', 'luo', 'wallfacer', 'cool', 'may', 'actually', 'done', 'fist', 'pump', 'though', 'way', 'dark', 'forest', 'theory', 'right', 'see', 'reason', 'would', 'society', 'probably', 'stop', 'broadcasting', 'much', 'signal', 'universe']
## TF-IDF VECTORIZATION
# Imports
from sklearn.feature_extraction.text import TfidfVectorizer
# Cap the vocabulary at the 5000 highest-scoring terms.
vectorizer = TfidfVectorizer(max_features=5000) # Set a limit on the number of features
# TfidfVectorizer expects raw strings, so rejoin the cleaned token lists.
df['review_text_str'] = df['final_reviews'].apply(lambda x: ' '.join(x))
# Learn the vocabulary and produce the sparse document-term matrix.
tfidf_matrix = vectorizer.fit_transform(df['review_text_str'])
# Densify into a DataFrame for inspection only (memory-heavy; the sparse
# matrix, not this dense copy, is used for modeling).
tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=vectorizer.get_feature_names_out())
print('Shape of the TF-IDF matrix:', tfidf_matrix.shape)
print()
print(f'TF-IDF DataFrame head:\n{tfidf_df.head()}')
Shape of the TF-IDF matrix: (94693, 5000) TF-IDF DataFrame head: 1960s 1970s 19th 1st 20th 2nd 3rd 4th 5th aaron ... \ 0 0.0 0.0 0.0 0.0 0.0 0.0 0.000000 0.0 0.0 0.0 ... 1 0.0 0.0 0.0 0.0 0.0 0.0 0.136481 0.0 0.0 0.0 ... 2 0.0 0.0 0.0 0.0 0.0 0.0 0.000000 0.0 0.0 0.0 ... 3 0.0 0.0 0.0 0.0 0.0 0.0 0.000000 0.0 0.0 0.0 ... 4 0.0 0.0 0.0 0.0 0.0 0.0 0.000000 0.0 0.0 0.0 ... youngest youth youtube yummy zach zane zero zoe zombie zone 0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 2 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 3 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 4 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 [5 rows x 5000 columns]
## SENTIMENT ANALYSIS
# Imports
import numpy as np
from nltk.sentiment.vader import SentimentIntensityAnalyzer
# Fetch the VADER lexicon (no-op when already downloaded).
nltk.download('vader_lexicon')
# Shared analyzer instance used by get_sentiment_score below.
sia = SentimentIntensityAnalyzer()
# Get sentiment score
def get_sentiment_score(tokens):
    """Return the VADER compound sentiment score for a token list.

    Parameters
    ----------
    tokens : list[str] or None
        Preprocessed review tokens (from the `final_reviews` column).

    Returns
    -------
    float
        Compound polarity score in [-1, 1], or NaN when there are
        no tokens to score.
    """
    # `not tokens` is already True for both None and an empty list, so the
    # original's extra `tokens is None` check was redundant (and unreachable,
    # since `not None` short-circuits first).
    if not tokens:
        return np.nan
    # VADER expects a plain string, so rejoin the tokens before scoring.
    return sia.polarity_scores(' '.join(tokens))['compound']
# Score every cleaned review; rows with empty token lists become NaN.
df['sentiment_score'] = df['final_reviews'].apply(get_sentiment_score)
# Spot-check the first few scores.
print(df[['final_reviews', 'sentiment_score']].head())
[nltk_data] Downloading package vader_lexicon to [nltk_data] /users/PAS2038/bittner87/nltk_data... [nltk_data] Package vader_lexicon is already up-to-date!
final_reviews sentiment_score 0 [mind, blowingly, cool, best, science, fiction... 0.9186 1 [took, month, read, first, page, book, took, l... 0.9738 2 [] NaN 3 [special, book, started, slow, first, third, m... 0.9764 4 [like, contact, love, book, unfolds, like, mys... 0.8481
## Distribution of sentiment scores
# Histogram of VADER compound scores across all reviews.
fig = px.histogram(
    df,
    x='sentiment_score',
    nbins=30,
    title='Distribution of Sentiment Scores',
    labels={'sentiment_score': 'Sentiment Score'},
)
fig.update_layout(xaxis_title='Sentiment Score', yaxis_title='Frequency')
fig.show()
## Average sentiment score per primary genre
# Mean compound sentiment per primary genre.
avg_sentiment_by_genre = df.groupby('primary_genre')['sentiment_score'].mean().reset_index()
# Bar chart of average sentiment by genre.
fig = px.bar(avg_sentiment_by_genre, x='primary_genre', y='sentiment_score',
title='Average Sentiment Score by Genre',
labels={'primary_genre': 'Primary Genre', 'sentiment_score': 'Average Sentiment Score'},
color='sentiment_score',
color_continuous_scale='tealgrn')
# Transparency plus rotated labels for readability.
fig.update_traces(opacity=0.85)
fig.update_layout(
xaxis_title='Primary Genre',
yaxis_title='Average Sentiment Score',
title_x=0.5,
width=800,
height=500,
xaxis=dict(tickangle=-45, title_standoff=10),
margin=dict(l=40, r=40, t=60, b=100)
)
fig.show()
## Sentiment vs review length
# Review length measured in cleaned tokens (length of the token list).
df['review_length'] = df['final_reviews'].apply(len)
# Scatter of compound sentiment against token count.
fig = px.scatter(df, x='review_length', y='sentiment_score',
title='Sentiment Score vs. Review Length',
labels={'review_length': 'Review Length', 'sentiment_score': 'Sentiment Score'},
opacity=0.6,
color='sentiment_score',
color_continuous_scale='spectral')
# NOTE(review): this overrides the opacity=0.6 passed to px.scatter above —
# confirm which value is intended.
fig.update_traces(opacity=0.85)
fig.update_layout(
xaxis_title='Review Length',
yaxis_title='Sentiment Score',
title_x=0.5,
width=800,
height=500,
xaxis=dict(tickangle=-45, title_standoff=10),
margin=dict(l=40, r=40, t=60, b=100)
)
fig.show()
# ## Sentiment over time
# # Ensure 'date_added' is a datetime object
# df['date_added'] = pd.to_datetime(df['date_added'], errors='coerce')
# # Drop rows with invalid 'date_added' values
# df = df.dropna(subset=['date_added'])
# # Extract the year for grouping
# df['year_added'] = df['date_added'].dt.year
# # Calculate average sentiment over time (by year)
# avg_sentiment_by_year = df.groupby('year_added')['sentiment_score'].mean().reset_index()
# # Line chart of sentiment over time
# fig = px.line(avg_sentiment_by_year, x='year_added', y='sentiment_score',
# title='Average Sentiment Score Over Time',
# labels={'year_added': 'Year', 'sentiment_score': 'Average Sentiment Score'},
# markers=True)
# fig.update_layout(xaxis_title='Year', yaxis_title='Average Sentiment Score')
# fig.show()
## DEFINE, ENCODE, SPLIT
# Imports
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
# Features: the sparse TF-IDF matrix of the cleaned review text.
X = tfidf_matrix # vectorized reviews
# Target: the primary genre of each review.
y = df['primary_genre'] # target: primary genre
# Encode genre names as integer class labels (le.classes_ recovers names).
le = LabelEncoder()
y = le.fit_transform(y)
# 80/20 train/test split, seeded for reproducibility.
# NOTE(review): classes are imbalanced; stratify=y would preserve class
# proportions in both splits — confirm before changing, as it alters results.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
## SMOTE - Synthetic minority over-sampling
# Imports
from imblearn.over_sampling import SMOTE
from imblearn.combine import SMOTEENN
# NOTE(review): SMOTEENN is imported but never used below.
# Oversample minority genres in the TRAINING set only (the test set must keep
# the real class distribution). k_neighbors=3 tolerates small classes.
smote = SMOTE(sampling_strategy='auto', k_neighbors=3, random_state=42)
X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)
X_train, y_train = X_train_smote, y_train_smote
## CREATE AND TRAIN MODEL
# Imports
from sklearn.ensemble import RandomForestClassifier
# Random forest over the TF-IDF features.
model = RandomForestClassifier(
n_estimators=500, # Number of trees
max_depth=None, # Allow trees to grow as deep as they want
min_samples_split=2, # Minimum samples to split
min_samples_leaf=4, # Minimum samples at leaf; mild regularization
max_features='log2', # Number of features to consider at each split
bootstrap=True, # Use bootstrap sampling
criterion='gini', # Quality of split
random_state=42 # For reproducibility
)
# Fit on the SMOTE-resampled training data.
model.fit(X_train, y_train)
RandomForestClassifier(max_features='log2', min_samples_leaf=4,
n_estimators=500, random_state=42)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. RandomForestClassifier(max_features='log2', min_samples_leaf=4,
n_estimators=500, random_state=42)## CLASSIFICATION REPORT AND CONFUSION MATRIX
# Imports
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from sklearn.metrics import classification_report, accuracy_score
import matplotlib.pyplot as plt
# Predict genre labels for the held-out test set.
y_pred = model.predict(X_test)
# Overall accuracy plus per-class precision/recall/F1 (genre names restored
# via the label encoder's classes_).
print('Accuracy:', accuracy_score(y_test, y_pred))
print()
print('Classification Report:\n', classification_report(y_test, y_pred, target_names=le.classes_))
# Confusion matrix: rows = true genre, columns = predicted genre.
cm = confusion_matrix(y_test, y_pred)
# Display object maps integer labels back to genre names.
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=le.classes_)
# Large square axes with vertical tick labels so genre names fit.
fig, ax = plt.subplots(figsize=(8,8))
disp.plot(ax=ax, xticks_rotation='vertical')
# Show the plot
plt.title("Confusion Matrix")
plt.tight_layout()
plt.show()
Accuracy: 0.5065209356354612
Classification Report:
precision recall f1-score support
children 0.45 0.68 0.54 819
comics, graphic 0.56 0.75 0.64 1212
fantasy, paranormal 0.51 0.42 0.46 3598
history, historical fiction, biography 0.52 0.33 0.40 2478
mystery, thriller, crime 0.49 0.46 0.47 2181
non-fiction 0.53 0.60 0.56 2345
poetry 0.14 0.66 0.23 191
romance 0.57 0.65 0.61 3926
young-adult 0.46 0.31 0.37 2189
accuracy 0.51 18939
macro avg 0.47 0.54 0.48 18939
weighted avg 0.52 0.51 0.50 18939
## ROC Curve
# Imports
from sklearn.metrics import roc_curve, auc
from sklearn.preprocessing import label_binarize
# NOTE: this is a 9-class problem, so a single binary ROC curve built from
# predict_proba[:, 1] with pos_label=1 (as originally written) describes only
# class 1 while the title implies the whole model. Instead, draw one
# one-vs-rest ROC curve per genre, mirroring the precision-recall section.
y_test_bin = label_binarize(y_test, classes=range(len(le.classes_)))
# Compute the class-probability matrix once.
y_proba = model.predict_proba(X_test)
fig_roc = px.line(title='One-vs-Rest ROC Curves by Genre',
                  labels={'x': 'False Positive Rate', 'y': 'True Positive Rate'},
                  height=500)
for i in range(y_test_bin.shape[1]):
    fpr, tpr, _ = roc_curve(y_test_bin[:, i], y_proba[:, i])
    roc_auc = auc(fpr, tpr)
    fig_roc.add_scatter(x=fpr, y=tpr, mode='lines',
                        name=f'{le.classes_[i]} (AUC = {roc_auc:.2f})')
# Diagonal reference line for a random classifier.
fig_roc.add_scatter(x=[0, 1], y=[0, 1], mode='lines', name='Random Classifier', line=dict(dash='dash', color='rgb(252, 115, 212)'))
fig_roc.update_layout(xaxis_title='False Positive Rate', yaxis_title='True Positive Rate')
fig_roc.show()
## Precision Recall Curve
# Imports
from sklearn.metrics import precision_recall_curve
from sklearn.preprocessing import label_binarize
import plotly.express as px
# One-hot encode the integer labels for one-vs-rest PR computation.
y_test_bin = label_binarize(y_test, classes=range(len(le.classes_)))
# Compute the class-probability matrix ONCE; the original called
# model.predict_proba(X_test) inside the loop, repeating the full forest
# prediction for every class.
y_proba = model.predict_proba(X_test)
# Initialize the plot; per-class curves are added below.
fig_pr = px.area(title='Precision-Recall Curve',
                 labels={'x': 'Recall', 'y': 'Precision'},
                 height=600)
# One precision-recall curve per genre (one-vs-rest).
for i in range(y_test_bin.shape[1]):
    precision, recall, _ = precision_recall_curve(y_test_bin[:, i], y_proba[:, i])
    # Add each class curve to the plot
    fig_pr.add_scatter(x=recall, y=precision, mode='lines', name=f'{le.classes_[i]}')
# Update axis labels
fig_pr.update_layout(
    xaxis_title='Recall',
    yaxis_title='Precision',
    height=600
)
fig_pr.show()
## Feature Importance
# Impurity-based importances from the trained random forest.
importances = model.feature_importances_
# TF-IDF vocabulary terms, aligned index-for-index with the importances.
features = vectorizer.get_feature_names_out()
# Pair each term with its importance.
feat_importance_df = pd.DataFrame({'Feature': features, 'Importance': importances})
# Sort the features by importance, most important first.
feat_importance_df = feat_importance_df.sort_values(by='Importance', ascending=False)
# Bar chart of the 20 most important terms; color scale anchored at 0.
fig_feat_importance = px.bar(feat_importance_df.head(20),
x='Feature',
y='Importance',
title='Top 20 Features',
labels={'Feature': 'Feature', 'Importance': 'Importance'},
height=400,
color='Importance',
color_continuous_scale='tealgrn',
range_color=[0, max(feat_importance_df['Importance'])]
)
# Update layout (rotated ticks, font sizes, colorbar title).
fig_feat_importance.update_layout(
xaxis_tickangle=45,
xaxis={'tickfont': {'size': 12}},
yaxis_title='Importance',
yaxis={'tickfont': {'size': 14}},
coloraxis_colorbar=dict(title='Importance'),
title={'font': {'size': 16}},
showlegend=False
)
fig_feat_importance.show()